Load Libraries

library(forecast)
library(dplyr)
library(tidyr)
library(stringr)
library(lubridate)
library(ggplot2)
theme_set(theme_bw())
library(weatherGen)
data(climate)

# set RNG seed for reproducibility
set.seed(2345)

Load Monthly Climate Data

This analysis will use the gridded climate dataset by Maurer et al, 2002. The data have already been downloaded and converted to an SQLite database using the makefile in walkerjeffd/climate-data/maurer. After following the instructions in this repo, there should be an SQLite database called maurer/db/maurer_mon.db, which contains a grid table containing the grid points, and a data table containing the monthly timeseries for each grid point. See the vignette for examples on how to use this database.

DB_PATH <- '/Users/jeff/Projects/UMass/climate-data/maurer/db/maurer_mon.db'
db <- src_sqlite(DB_PATH, create = FALSE)

df.grid <- tbl(db, "grid") %>%
  collect

Here is a map of the climate data grid points.

library(ggmap)
map <- get_map(location=c(lon=mean(range(df.grid$LON)), lat=mean(range(df.grid$LAT))),
               zoom=4, maptype="satellite", color='bw')
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=36.625,-78.8125&zoom=4&size=%20640x640&scale=%202&maptype=satellite&sensor=false
## Google Maps API Terms of Service : http://developers.google.com/maps/terms
ggmap(map, darken=c(0.25, "white"), extent="device") +
  geom_point(aes(x=LON, y=LAT), data=df.grid, color='red', size=1)

plot of chunk map east

Zoomed into new england. The green points will be used to compute a regional average.

map <- get_map(location=c(lon=-72, lat=42),
               zoom=7, maptype="satellite", color='bw')
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=42,-72&zoom=7&size=%20640x640&scale=%202&maptype=satellite&sensor=false
## Google Maps API Terms of Service : http://developers.google.com/maps/terms
ggmap(map, darken=c(0.25, "white"), extent="device") +
  geom_point(aes(x=LON, y=LAT, color=SELECT), 
             data=mutate(df.grid, SELECT=(LAT>=41 & LAT<=43 & LON>=-74 & LON<=-70)), size=2) +
  geom_vline(aes(xintercept=LON), color='green', linetype=2, 
             data=data.frame(LON=seq(-74, -70))) +
  geom_hline(aes(yintercept=LAT), color='green', linetype=2, 
             data=data.frame(LAT=seq(41, 43))) +
  scale_color_manual(values=c("TRUE"="green", "FALSE"="red"), guide=FALSE)

plot of chunk map new england

Monthly Climate Dataset

Using the selected grid points, retrieve the monthly timeseries from the database. Also, compute the variable TAVG as the arithmetic mean of TMIN and TMAX.

clim.mon <- tbl(db, "data") %>%
  filter(LAT>=41, LAT<=43, LON>=-74, LON<=-70) %>%
  collect %>%
  select(-REGION, -WIND) %>%
  mutate(TAVG=(TMIN+TMAX)/2) %>%
  gather(VAR, VALUE, PRCP, TMIN, TMAX, TAVG)
summary(clim.mon)
##       YEAR          MONTH            LAT            LON       
##  Min.   :1949   Min.   : 1.00   Min.   :41.1   Min.   :-73.9  
##  1st Qu.:1964   1st Qu.: 3.75   1st Qu.:41.7   1st Qu.:-73.3  
##  Median :1980   Median : 6.50   Median :42.1   Median :-72.4  
##  Mean   :1980   Mean   : 6.50   Mean   :42.1   Mean   :-72.4  
##  3rd Qu.:1995   3rd Qu.: 9.25   3rd Qu.:42.6   3rd Qu.:-71.7  
##  Max.   :2010   Max.   :12.00   Max.   :42.9   Max.   :-70.1  
##    VAR             VALUE      
##  PRCP:267840   Min.   :-22.1  
##  TMIN:267840   1st Qu.:  3.2  
##  TMAX:267840   Median : 14.2  
##  TAVG:267840   Mean   : 31.3  
##                3rd Qu.: 28.3  
##                Max.   :705.8

Compute annual timeseries of each climate variable and each location. Note that precipitation is simply the annual sum, while the other three variables are computed as weighted means based on the number of days in each month to get a correct annual mean.

clim.yr <- spread(clim.mon, VAR, VALUE) %>%
  mutate(MONTHYEAR=ymd(paste(YEAR,MONTH,1,sep='-')),
         N_DAY=days_in_month(MONTHYEAR)) %>%
  group_by(LAT, LON, YEAR) %>%
  summarise(PRCP=sum(PRCP),
            TMAX=sum(TMAX*N_DAY)/sum(N_DAY),
            TMIN=sum(TMIN*N_DAY)/sum(N_DAY),
            TAVG=sum(TAVG*N_DAY)/sum(N_DAY)) %>%
  gather(VAR, VALUE, PRCP:TAVG)
summary(clim.yr)
##       LAT            LON             YEAR        VAR       
##  Min.   :41.1   Min.   :-73.9   Min.   :1949   PRCP:22320  
##  1st Qu.:41.7   1st Qu.:-73.3   1st Qu.:1964   TMAX:22320  
##  Median :42.1   Median :-72.4   Median :1980   TMIN:22320  
##  Mean   :42.1   Mean   :-72.4   Mean   :1980   TAVG:22320  
##  3rd Qu.:42.6   3rd Qu.:-71.7   3rd Qu.:1995               
##  Max.   :42.9   Max.   :-70.1   Max.   :2010               
##      VALUE       
##  Min.   :  -3.5  
##  1st Qu.:   6.1  
##  Median :  11.6  
##  Mean   : 302.8  
##  3rd Qu.: 139.5  
##  Max.   :1978.9

Regional Average

Compute the regional average monthly value of each climate variable.

clim.reg.mon <- clim.mon %>%
  group_by(YEAR, MONTH, VAR) %>%
  summarise(MEAN=mean(VALUE),
            SD=sd(VALUE))
summary(clim.reg.mon)
##       YEAR          MONTH         VAR           MEAN      
##  Min.   :1949   Min.   : 1.00   PRCP:744   Min.   :-14.9  
##  1st Qu.:1964   1st Qu.: 3.75   TMIN:744   1st Qu.:  3.0  
##  Median :1980   Median : 6.50   TMAX:744   Median : 14.4  
##  Mean   :1980   Mean   : 6.50   TAVG:744   Mean   : 31.3  
##  3rd Qu.:1995   3rd Qu.: 9.25              3rd Qu.: 28.6  
##  Max.   :2010   Max.   :12.00              Max.   :373.9  
##        SD        
##  Min.   :  0.81  
##  1st Qu.:  1.29  
##  Median :  1.64  
##  Mean   :  7.10  
##  3rd Qu.:  3.21  
##  Max.   :110.64

This figure shows the regional average monthly climate timeseries. The shaded region shows mean +/- 1 standard deviation.

mutate(clim.reg.mon, DATE=ymd(paste(YEAR,MONTH,1,sep='-'))) %>%
  ggplot(aes(DATE, MEAN)) +
  geom_ribbon(aes(ymin=MEAN-SD, ymax=MEAN+SD), fill='grey50') +
  geom_line(color='blue') +
  facet_wrap(~VAR, ncol=1, scales='free_y') +
  labs(x="Month/Year", y="Mean +/- StDev")

plot of chunk plot clim reg mon

Compute the regional average annual value of each climate variable.

clim.reg.yr <-  clim.yr %>%
  group_by(VAR, YEAR) %>%
  summarise(MEAN=mean(VALUE),
            SD=sd(VALUE))
summary(clim.reg.yr)
##    VAR          YEAR           MEAN              SD        
##  PRCP:62   Min.   :1949   Min.   :   1.8   Min.   :  1.01  
##  TMAX:62   1st Qu.:1964   1st Qu.:   7.1   1st Qu.:  1.25  
##  TMIN:62   Median :1980   Median :  11.9   Median :  1.53  
##  TAVG:62   Mean   :1980   Mean   : 302.8   Mean   : 30.52  
##            3rd Qu.:1995   3rd Qu.: 203.6   3rd Qu.: 19.48  
##            Max.   :2010   Max.   :1520.8   Max.   :208.59

This figure shows the regional-average annual climate timeseries. The shaded region shows the mean +/- 1 standard deviation.

ggplot(clim.reg.yr, aes(YEAR, MEAN)) +
  geom_ribbon(aes(ymin=MEAN-SD, ymax=MEAN+SD), fill='grey80') +
  geom_line(color='blue') +
  facet_wrap(~VAR, ncol=1, scales='free_y') +
  labs(x="Year", y="Mean +/- StDev")

plot of chunk plot clim reg yr

Convert the regional average datasets wide format for use in the weather generator.

clim.reg.mon <- select(clim.reg.mon, -SD) %>%
  spread(VAR, MEAN)
head(clim.reg.mon)
## Source: local data frame [6 x 6]
## 
##   YEAR MONTH   PRCP   TMIN   TMAX    TAVG
## 1 1949     1 117.52 -4.818  4.243 -0.2873
## 2 1949     2  77.78 -6.123  4.882 -0.6205
## 3 1949     3  48.22 -3.279  8.025  2.3733
## 4 1949     4  98.86  2.510 15.229  8.8692
## 5 1949     5 108.86  7.175 21.098 14.1365
## 6 1949     6  23.19 13.578 27.450 20.5140
clim.reg.yr <- select(clim.reg.yr, -SD) %>%
  spread(VAR, MEAN)
head(clim.reg.yr)
## Source: local data frame [6 x 5]
## 
##   YEAR   PRCP  TMAX  TMIN   TAVG
## 1 1949  912.9 16.22 4.169 10.194
## 2 1950 1075.9 14.50 2.944  8.724
## 3 1951 1281.2 14.94 3.561  9.249
## 4 1952 1168.9 15.13 3.754  9.443
## 5 1953 1329.0 16.07 4.205 10.139
## 6 1954 1291.9 14.66 3.370  9.014

Wavelet Analysis

Perform wavelet analysis on regional average annual precipitation timeseries.

wt.reg <- wavelet_analysis(clim.reg.yr$PRCP, sig.level=0.90, noise.type='white')

par(mfrow=c(1,2))
plot(wt.reg, plot.cb=TRUE, plot.phase=FALSE)
plot(wt.reg$gws, wt.reg$period, type="b",
     xlab="Global Wavelet Spectrum", ylab="",
     log="y", ylim=rev(range(wt.reg$period)), 
     xlim=range(c(wt.reg$gws, wt.reg$gws.sig$signif)))
lines(wt.reg$gws.sig$signif, wt.reg$period, lty=2, col='red')

plot of chunk plot wave

Although there are some significant wavelet periods, we’ll use a simple arima model of the regional annual precipitation instead of arima models of the wavelet components.

models <- list(PRCP=auto.arima(clim.reg.yr$PRCP,
                               max.p=2,max.q=2,max.P=0,max.Q=0,
                               stationary=TRUE))
models[["PRCP"]]
## Series: clim.reg.yr$PRCP 
## ARIMA(1,0,0) with non-zero mean 
## 
## Coefficients:
##         ar1  intercept
##       0.218    1183.87
## s.e.  0.126      26.69
## 
## sigma^2 estimated as 27243:  log likelihood=-404.6
## AIC=815.2   AICc=815.6   BIC=821.6

This figure shows a 20-year forecast of the arima model reflecting no low-frequency oscillations (since we are not using the wavelet decomposition).

par(mfrow=c(1,1))
plot(forecast(models[['PRCP']], h=20), main="ARIMA Forecast of Regional Annual Precip",
     xlab="Timestep (yr)", ylab='Annual Precip (mm/yr)')

plot of chunk plot arima forecast

Generate 50 simulations of annual precipitation using the AR1 model of the regional annual precipitation.

sim.prcp <- weatherGen::mc_arimas(models=models, n=nrow(clim.reg.yr), n.iter=50)
str(sim.prcp)
## List of 4
##  $ x       : num [1:62, 1:50] 1063 1355 1381 1024 1460 ...
##  $ gws     : num [1:21, 1:50] 9569 14863 21023 27508 23138 ...
##  $ x.stat  : num [1:62, 1:3] 1168 1178 1188 1182 1185 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:3] "MEAN" "Q025" "Q975"
##  $ gws.stat: num [1:21, 1:3] 12705 20034 21128 22864 24353 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:3] "MEAN" "Q025" "Q975"

Compare the GWS power and the mean, standard deviation, and skewness statistics across these simulations (red point is the overall mean, boxplots are the distribution across simulations) to regional annual precipitation timeseries (blue point).

par(mfrow=c(2,2))
x1 <- 1
x2 <- min(length(wt.reg$period), nrow(sim.prcp$gws))
ymin <- min(wt.reg$period[x1:x2], wt.reg$gws[x1:x2],
            sim.prcp$gws.stat[x1:x2, 'MEAN'],
            sim.prcp$gws.stat[x1:x2, 'Q025'])
ymax <- max(wt.reg$period[x1:x2], wt.reg$gws[x1:x2],
            sim.prcp$gws.stat[x1:x2, 'MEAN'],
            sim.prcp$gws.stat[x1:x2, 'Q975'])
plot(wt.reg$period[x1:x2], wt.reg$gws[x1:x2],
     type="n", ylim=c(ymin,ymax), xlab="Period (years)", ylab="",
     main="", log="y")
mtext(side=2, expression(paste("Power (",mm^2,")")), line=2.5)
polygon(c(wt.reg$period[x1:x2],
          rev(wt.reg$period[x1:x2])),
        c(sim.prcp$gws.stat[x1:x2, 'Q025'],
          rev(sim.prcp$gws.stat[x1:x2, 'Q975'])),
        col="grey")
lines(wt.reg$period[x1:x2], wt.reg$gws[x1:x2])
lines(wt.reg$period[x1:x2], sim.prcp$gws.stat[x1:x2, 'MEAN'], lty=2, col="blue")
lines(wt.reg$period[x1:x2], wt.reg$gws.sig$signif[x1:x2], col="red", lty=3)

boxplot(colMeans(sim.prcp$x), main="Mean")
points(mean(clim.reg.yr$PRCP),col="red",pch=19)
points(mean(colMeans(sim.prcp$x)), col="blue", pch=19)

boxplot(apply(sim.prcp$x, 2, sd), main="Standard Deviation")
points(sd(clim.reg.yr$PRCP), col="red", pch=19)
points(mean(apply(sim.prcp$x, 2, sd)), col="blue", pch=19)

boxplot(apply(sim.prcp$x, 2, skewness), main="Skew")
points(skewness(clim.reg.yr$PRCP), col="red", pch=19)
points(mean(apply(sim.prcp$x, 2, skewness)),col="blue",pch=19)

plot of chunk plot gws stats

This figure shows the simulated and observed regional-average annual precipitation. The simulations were generated using the AR1 model described above.

as.data.frame(sim.prcp$x) %>%
  mutate(TIMESTEP=row_number()) %>% 
  gather(TRIAL, VALUE, -TIMESTEP) %>% 
  ggplot(aes(TIMESTEP, VALUE)) + 
  geom_line(aes(group=TRIAL), alpha=0.3) + 
  stat_summary(fun.y=mean, geom='line', color='red', size=1) +
  geom_line(aes(x=TIMESTEP, y=PRCP), data=mutate(clim.reg.yr, TIMESTEP=row_number()), color='blue', size=1) +
  labs(x="Year", y="Annual Precipitation (mm/yr)", 
       title="WARM Annual Precipitation Simulations\nRed Line=Mean of Simulations, Blue Line=Observd Regional Mean")

plot of chunk plot ar sims

Monthly Weather Generator

Select Location

First, select a random location from the complete climate dataset.

clim.locs <- select(clim.yr, LAT, LON) %>%
  unique()
loc <- clim.locs[sample(nrow(clim.locs), size=1),] %>% as.numeric
names(loc) <- c('LAT', 'LON')
loc
##    LAT    LON 
##  42.19 -72.56

Now extract the monthly and annual climate datasets for this location.

clim.loc.yr <- filter(clim.yr, LAT==loc[['LAT']], LON==loc[['LON']]) %>%
  spread(VAR, VALUE) %>%
  arrange(YEAR)
clim.loc.mon <- filter(clim.mon, LAT==loc[['LAT']], LON==loc[['LON']]) %>%
  spread(VAR, VALUE) %>%
  arrange(YEAR, MONTH)

This figure shows the original climate timeseries for the regional-average dataset and the selected location.

clim.loc.mon %>%
  select(-LAT, -LON) %>%
  mutate(SOURCE="Location") %>%
  rbind(clim.reg.mon %>% mutate(SOURCE="Region")) %>%
  mutate(SOURCE=factor(SOURCE)) %>%
  gather(VAR, VALUE, PRCP:TAVG) %>%
  mutate(DATE=ymd(paste(YEAR, MONTH, 1, sep='-'))) %>%
  ggplot(aes(DATE, VALUE, color=SOURCE)) +
  geom_line() +
  labs(x="Month/Year", y="") +
  scale_color_manual('', values=c('steelblue', 'orangered')) +
  facet_wrap(~VAR, ncol=1, scales='free_y')

plot of chunk plot clim loc reg

Nearest Neighbors

Compute the euclidian distance between a single simulated annual precipitation value and each observed year. The simulated annual precipitation is for the first year in the first simulation trial.

j <- 1 # year number
samp <- 1 # simulation trial number

# compute distances between simulated WARM timeseries and observed regional precip
distances <- cbind(clim.reg.yr, DISTANCE=sqrt((sim.prcp$x[j,samp] - clim.reg.yr$PRCP)^2))

ggplot(distances, aes(YEAR, PRCP)) + 
  geom_line() +
  geom_point(aes(color=DISTANCE), size=3) +
  geom_hline(yint=sim.prcp$x[j,samp], linetype=2, color='red') +
  geom_text(x=max(distances$YEAR), y=sim.prcp$x[j,samp], label="Sim", hjust=1, vjust=-0.5) +
  scale_color_gradient(low='green', high='red') +
  labs(x="Year", y="Annual Precipitation (mm/yr)", 
       title="Regional and Current Simulation Annual Precipitation")

plot of chunk distances

Select the eight years from the observed regional annual simulation that are the most similar to the first simulation trial and year.

distances <- mutate(distances,
                    IN_TOP_8=rank(DISTANCE) %in% seq(1,8))
distances %>%
  ggplot(aes(YEAR, PRCP)) + 
  geom_line() +
  geom_point(aes(color=DISTANCE), size=3) +
  geom_hline(yint=sim.prcp$x[j,samp], linetype=2, color='red') +
  geom_point(mapping=aes(alpha=IN_TOP_8), color='black', shape=1, size=4) +
  geom_text(aes(x=YEAR, y=PRCP, label=YEAR), data=filter(distances, IN_TOP_8), vjust=-1) +
  geom_text(x=max(distances$YEAR), y=sim.prcp$x[j,samp], label="Sim", hjust=1, vjust=-0.5) +
  scale_alpha_manual(values=c("TRUE"=1, "FALSE"=0), guide=FALSE) +
  scale_color_gradient(low='green', high='red') +
  labs(x="Year", y="Annual Precipitation (mm)", title="Regional Annual Precipitation with 8 Nearest Neighbors Selected")

plot of chunk plot top distances

Extract the observed regional precipitation for the candidate years and compute the sampling weights as inverse distance-squared weighted.

distances.top <- filter(distances, IN_TOP_8) %>%
  mutate(WEIGHT=(1/DISTANCE)^2,
         WEIGHT=WEIGHT/sum(WEIGHT))
distances.top %>%
  ggplot(aes(PRCP, WEIGHT)) +
  geom_point() +
  geom_vline(xint=sim.prcp$x[j,samp], linetype=2, color='red') +
  geom_text(aes(label=YEAR), hjust=-0.1) +
  geom_text(x=sim.prcp$x[j,samp], y=0.4, label="Sim Precip", hjust=-0.1) +
  labs(x="Annual Precipitation (mm)", y="Sample Weight")

plot of chunk top distances

Now we can randomly sample from the 8 nearest neighbors using the sampling weights based on the inverse distance squared. We’ll do this 1000 times and compare the frequency of the samples to the sample weights.

sampled.years <- sample(distances.top$YEAR,
                        size=1000,
                        prob=distances.top$WEIGHT,
                        replace=TRUE)
sampled.years.tbl <- prop.table(table(sampled.years))
merge(distances.top, 
      data.frame(YEAR=names(sampled.years.tbl),
                 FREQ=as.vector(sampled.years.tbl)),
      all.x=TRUE) %>%
  select(YEAR, WEIGHT, FREQ) %>%
  ggplot(aes(WEIGHT, FREQ)) +
  geom_point(size=3) +
  geom_abline(linetype=2) +
  labs(x="Sample Weight", y="Sampled Frequency")

plot of chunk sample years

Now if we just sample for one year.

sampled.year <- sample(distances.top$YEAR, size=1, prob=distances.top$WEIGHT, replace=TRUE)
sampled.year
## [1] 1988

Then we can extract the observed monthly climate data for the sampled year from the observed dataset of the selected location.

filter(clim.loc.mon, YEAR==sampled.year)
## Source: local data frame [12 x 8]
## 
##    YEAR MONTH   LAT    LON   PRCP   TMIN  TMAX   TAVG
## 1  1988     1 42.19 -72.56  70.10 -12.27  0.82 -5.725
## 2  1988     2 42.19 -72.56 114.40  -9.54  3.15 -3.195
## 3  1988     3 42.19 -72.56  65.47  -4.25  8.95  2.350
## 4  1988     4 42.19 -72.56  86.18   2.33 13.58  7.955
## 5  1988     5 42.19 -72.56  95.32   8.04 21.73 14.885
## 6  1988     6 42.19 -72.56  54.65  10.93 26.58 18.755
## 7  1988     7 42.19 -72.56 174.73  16.68 30.01 23.345
## 8  1988     8 42.19 -72.56 115.75  15.87 29.60 22.735
## 9  1988     9 42.19 -72.56  39.65   8.05 23.39 15.720
## 10 1988    10 42.19 -72.56  62.00   1.47 14.67  8.070
## 11 1988    11 42.19 -72.56 159.30  -1.09 11.33  5.120
## 12 1988    12 42.19 -72.56  24.10  -8.07  3.63 -2.220

Generate Complete Timeseries

Given the selected location, repeat the nearest neighbor process for each of the num_sim simulations and num_year years, and store the result in a 3-dimensional array.

num_sim <- ncol(sim.prcp$x)
num_year <- nrow(sim.prcp$x) # number of composited years
num_months <- num_year*12

climate_variables <- select(clim.reg.yr, -YEAR) %>%
  names()

sampled.year <- array(NA, c(num_year, num_sim))
gen.loc.mon.arr <- array(NA, c(num_months, length(climate_variables), num_sim))
# loop through simulations (trials)
for (samp in 1:num_sim) {
    # loop through years
    for (j in 1:num_year) {
        # compute distances between simulated WARM timeseries and observed regional precip
      distances <- mutate(clim.reg.yr, 
                          DISTANCE=sqrt((sim.prcp$x[j, samp] - clim.reg.yr$PRCP)^2),
                          IN_TOP_8=rank(DISTANCE) %in% seq(1,8))
      distances.top <- filter(distances, IN_TOP_8) %>%
                       mutate(WEIGHT=(1/DISTANCE)^2,
                              WEIGHT=WEIGHT/sum(WEIGHT))
      sampled.year[j,samp] <- sample(distances.top$YEAR,
                                     size=1,
                                     prob=distances.top$WEIGHT,
                                     replace=FALSE)
      row.idx <- ((1+12*(j-1)):(12+12*(j-1)))
      gen.loc.mon.arr[row.idx, , samp] <- data.matrix(filter(clim.loc.mon, 
                                                             YEAR==sampled.year[j, samp]) %>%
                                                      select(PRCP, TMAX, TMIN, TAVG))
    }
}

Now convert the 3-dim array to a list of dataframes with each element in the list containing the simulated data for a single variable.

gen.loc.mon <- sapply(climate_variables, function(v) {
  i.var <- which(climate_variables==v)
  df <- as.data.frame(gen.loc.mon.arr[,i.var,]) %>%
    mutate(TIMESTEP=row_number(),
           SIM_YEAR=rep(seq(1, num_year), each=12),
           SIM_MONTH=rep(seq(1, 12), times=num_year),
           VAR=v) %>%
    gather(TRIAL, VALUE, -TIMESTEP, -SIM_YEAR, -SIM_MONTH, -VAR) %>%
    mutate(TRIAL=as.numeric(TRIAL))
  return(list(df))
})
str(gen.loc.mon)
## List of 4
##  $ PRCP:'data.frame':    37200 obs. of  6 variables:
##   ..$ TIMESTEP : int [1:37200] 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ SIM_YEAR : int [1:37200] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ SIM_MONTH: int [1:37200] 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ VAR      : chr [1:37200] "PRCP" "PRCP" "PRCP" "PRCP" ...
##   ..$ TRIAL    : num [1:37200] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ VALUE    : num [1:37200] 70.1 114.4 65.5 86.2 95.3 ...
##  $ TMAX:'data.frame':    37200 obs. of  6 variables:
##   ..$ TIMESTEP : int [1:37200] 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ SIM_YEAR : int [1:37200] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ SIM_MONTH: int [1:37200] 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ VAR      : chr [1:37200] "TMAX" "TMAX" "TMAX" "TMAX" ...
##   ..$ TRIAL    : num [1:37200] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ VALUE    : num [1:37200] 0.82 3.15 8.95 13.58 21.73 ...
##  $ TMIN:'data.frame':    37200 obs. of  6 variables:
##   ..$ TIMESTEP : int [1:37200] 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ SIM_YEAR : int [1:37200] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ SIM_MONTH: int [1:37200] 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ VAR      : chr [1:37200] "TMIN" "TMIN" "TMIN" "TMIN" ...
##   ..$ TRIAL    : num [1:37200] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ VALUE    : num [1:37200] -12.27 -9.54 -4.25 2.33 8.04 ...
##  $ TAVG:'data.frame':    37200 obs. of  6 variables:
##   ..$ TIMESTEP : int [1:37200] 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ SIM_YEAR : int [1:37200] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ SIM_MONTH: int [1:37200] 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ VAR      : chr [1:37200] "TAVG" "TAVG" "TAVG" "TAVG" ...
##   ..$ TRIAL    : num [1:37200] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ VALUE    : num [1:37200] -5.72 -3.19 2.35 7.96 14.88 ...

Aggregate the generated timeseries to compute annual values or each variable.

gen.loc.yr <- lapply(gen.loc.mon, function(df) {
  if ('PRCP' %in% unique(df$VAR)) {
    df.yr <- group_by(df, VAR, TRIAL, SIM_YEAR) %>%
      summarise(VALUE=sum(VALUE))
  } else {
    df.yr <- group_by(df, VAR, TRIAL, SIM_YEAR) %>%
      summarise(VALUE=mean(VALUE))
  }
  df.yr <- df.yr %>%
    ungroup() %>%
    as.data.frame()
  return(df.yr)
})
str(gen.loc.yr)
## List of 4
##  $ PRCP:'data.frame':    3100 obs. of  4 variables:
##   ..$ VAR     : chr [1:3100] "PRCP" "PRCP" "PRCP" "PRCP" ...
##   ..$ TRIAL   : num [1:3100] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ SIM_YEAR: int [1:3100] 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ VALUE   : num [1:3100] 1062 1274 1458 958 1245 ...
##  $ TMAX:'data.frame':    3100 obs. of  4 variables:
##   ..$ VAR     : chr [1:3100] "TMAX" "TMAX" "TMAX" "TMAX" ...
##   ..$ TRIAL   : num [1:3100] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ SIM_YEAR: int [1:3100] 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ VALUE   : num [1:3100] 15.6 16.9 15 15.5 15.7 ...
##  $ TMIN:'data.frame':    3100 obs. of  4 variables:
##   ..$ VAR     : chr [1:3100] "TMIN" "TMIN" "TMIN" "TMIN" ...
##   ..$ TRIAL   : num [1:3100] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ SIM_YEAR: int [1:3100] 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ VALUE   : num [1:3100] 2.35 4.49 3.37 2.78 3.55 ...
##  $ TAVG:'data.frame':    3100 obs. of  4 variables:
##   ..$ VAR     : chr [1:3100] "TAVG" "TAVG" "TAVG" "TAVG" ...
##   ..$ TRIAL   : num [1:3100] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ SIM_YEAR: int [1:3100] 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ VALUE   : num [1:3100] 8.98 10.67 9.16 9.13 9.62 ...

This figure shows all the weather generator simulations compared to the observed climate dataset at the selected location.

do.call(rbind, gen.loc.yr) %>%
  mutate(VAR=factor(VAR)) %>%
  ggplot() +
    geom_line(aes(SIM_YEAR, VALUE, group=TRIAL), alpha=0.5) +
    geom_line(aes(SIM_YEAR, VALUE), 
              data=mutate(clim.reg.yr, SIM_YEAR=row_number()) %>%
                gather(VAR, VALUE, PRCP:TAVG), 
              color='red') +
    facet_wrap(~VAR, scales='free_y') +
    labs(x="Simulation Year", y="Value", title="All Generated Datasets with Observed Location Dataset ")

plot of chunk plot gen loc yr

This figure shows only the first weather generator simulation compared to the observed climate dataset at the selected location.

do.call(rbind, gen.loc.yr) %>%
  mutate(VAR=factor(VAR)) %>%
  filter(TRIAL==1) %>%
  ggplot() +
    geom_line(aes(SIM_YEAR, VALUE, group=TRIAL), alpha=0.5) +
    geom_line(aes(SIM_YEAR, VALUE), 
              data=mutate(clim.reg.yr, SIM_YEAR=row_number()) %>%
                gather(VAR, VALUE, PRCP:TAVG), 
              color='red') +
    facet_wrap(~VAR, scales='free_y') +
    labs(x="Simulation Year", y="Value", title="Single Generated Dataset with Observed Location Dataset ")

plot of chunk plot gen loc yr one

The following three figures compare the mean, standard deviation, and skewness of the annual precipitation timeseries between the weather generator simulations and the observed climate dataset at the selected location. The boxplots show the distribution of each statistic across the simulations, the red point shows the mean of the statistic for the simulations, and the blue point shows the statistic value for the observed annual timeseries.

do.call(rbind, gen.loc.yr) %>%
  mutate(VAR=factor(VAR)) %>%
  group_by(VAR, TRIAL) %>%
  summarise(MEAN=mean(VALUE)) %>%
  ggplot(aes(x=1, y=MEAN)) +
    geom_boxplot() +
    stat_summary(fun.y=mean, geom="point", color="red", size=3) +
    geom_point(aes(x=1, y=MEAN), 
               data=gather(clim.loc.yr, VAR, VALUE, PRCP:TAVG) %>% 
                      group_by(VAR) %>% 
                      summarise(MEAN=mean(VALUE)),
               color='blue', size=3) +
    facet_wrap(~VAR, scales='free_y') +
    labs(x="", y="Mean", title="Mean of Annual Precipitation for Generated (Red) and Observed (Blue)") +
    theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())

plot of chunk plot gen loc yr box mean

do.call(rbind, gen.loc.yr) %>%
  mutate(VAR=factor(VAR)) %>%
  group_by(VAR, TRIAL) %>%
  summarise(SD=sd(VALUE)) %>%
  ggplot(aes(x=1, y=SD)) +
    geom_boxplot() +
    stat_summary(fun.y=mean, geom="point", color="red", size=3) +
    geom_point(aes(x=1, y=SD), 
               data=gather(clim.loc.yr, VAR, VALUE, PRCP:TAVG) %>% 
                      group_by(VAR) %>% 
                      summarise(SD=sd(VALUE)),
               color='blue', size=3) +
    facet_wrap(~VAR, scales='free_y') +
    labs(x="", y="Standard Deviation", title="StDev of Annual Precipitation for Generated (Red) and Observed (Blue)") +
    theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())

plot of chunk plot gen loc yr box sd

do.call(rbind, gen.loc.yr) %>%
  mutate(VAR=factor(VAR)) %>%
  group_by(VAR, TRIAL) %>%
  summarise(SKEW=skewness(VALUE)) %>%
  ggplot(aes(x=1, y=SKEW)) +
    geom_boxplot() +
    stat_summary(fun.y=mean, geom="point", color="red", size=3) +
    geom_point(aes(x=1, y=SKEW), 
               data=gather(clim.loc.yr, VAR, VALUE, PRCP:TAVG) %>% 
                      group_by(VAR) %>% 
                      summarise(SKEW=skewness(VALUE)),
               color='blue', size=3) +
    facet_wrap(~VAR, scales='free_y') +
    labs(x="", y="Skewness", title="Skewness of Annual Precipitation for Generated (Red) and Observed (Blue)") +
    theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())

plot of chunk plot gen loc yr box skew

weatherGen Package

The weatherGen::gen_mon_arma() function provides a wrapper around these steps to generate multiple monthly timeseries based on the regional annual precipitation timeseries and the location monthly climate timeseries. This function returns a list object containing the following elements:

  • $models: list of arma models (only 1 if using ARMA model instead of WARM model)
  • $sim.prcp.yr: output from weatherGen::mc_arimas() containing annual precipitation simulations from the ARMA model
  • $sim.mon: unnamed list of length n.iter with each element containing a data.frame with the generated monthly timeseries for the given location. The columns YEAR_OBS and PRCP_OBS_YEAR contain the year and corresponding annual precipitation sampled from the monthly timeseries of the location.
wgen_mon <- weatherGen::gen_month_arma(x.reg.yr=clim.reg.yr, x.loc.mon=clim.loc.mon,
                                       n.iter=50, n.year=nrow(clim.reg.yr))
str(wgen_mon, max.level=1)
## List of 3
##  $ models     :List of 1
##  $ sim.prcp.yr:List of 4
##  $ sim.mon    :List of 50

Here is an example of the simulated monthly timeseries for the first trial.

head(wgen_mon$sim.mon[[1]])
##   YEAR_SIM MONTH   PRCP  TMAX   TMIN   TAVG YEAR_OBS PRCP_OBS_YEAR
## 1        1     1  69.20  3.71  -5.72 -1.005     2007          1180
## 2        1     2  51.75 -0.58 -10.56 -5.570     2007          1180
## 3        1     3 125.60  7.19  -5.51  0.840     2007          1180
## 4        1     4 163.08 12.53   1.24  6.885     2007          1180
## 5        1     5  76.45 22.96   7.75 15.355     2007          1180
## 6        1     6  75.78 25.68  13.15 19.415     2007          1180

This figure compares the total annual precipitation for the generated monthly timeseries and the annual arma model simulation. Note that the differences between the two reflect the different annual precipitation in the sampled year for the monthly time series and the simulated year from the ARMA model.

wgen_mon$sim.mon[[1]] %>%
  group_by(YEAR_SIM) %>%
  summarise(PRCP=sum(PRCP),
            YEAR_OBS=mean(YEAR_OBS),
            PRCP_OBS=mean(PRCP_OBS_YEAR)) %>%
  gather(VAR, VALUE, PRCP, PRCP_OBS) %>%
  ggplot(aes(YEAR_SIM, VALUE, color=VAR)) +
  geom_line() +
  labs(x="Year", y="Annual Precipitation (mm)")

plot of chunk plot wgen year compare

This figure shows the first simulation trial of the monthly climate timeseries for the selected location.

wgen_mon$sim.mon[[1]] %>%
  select(YEAR_SIM, MONTH, PRCP:TAVG) %>%
  gather(VAR, VALUE, PRCP:TAVG) %>%
  mutate(DATE=ymd(paste(YEAR_SIM, MONTH, 1, sep='-'))) %>%
  ggplot(aes(DATE, VALUE)) +
  geom_line() +
  facet_wrap(~VAR, scales='free_y', ncol=1) +
  labs(x="Month/Year", y="")

plot of chunk plot wgen mon

Session Info

sessionInfo()
## R version 3.1.1 (2014-07-10)
## Platform: x86_64-apple-darwin13.1.0 (64-bit)
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] weatherGen_0.1        biwavelet_0.17.3      mapproj_1.2-2        
##  [4] maps_2.3-7            ggmap_2.3             RSQLite.extfuns_0.0.1
##  [7] RSQLite_0.11.4        DBI_0.2-7             stringr_0.6.2        
## [10] ggplot2_1.0.0         lubridate_1.3.3       tidyr_0.1            
## [13] dplyr_0.2             forecast_5.4          timeDate_3010.98     
## [16] zoo_1.7-11            devtools_1.5         
## 
## loaded via a namespace (and not attached):
##  [1] assertthat_0.1      car_2.0-20          cluster_1.15.2     
##  [4] colorspace_1.2-4    digest_0.6.4        evaluate_0.5.5     
##  [7] fields_7.1          formatR_0.10        Formula_1.1-1      
## [10] fracdiff_1.4-2      grid_3.1.1          gtable_0.1.2       
## [13] Hmisc_3.14-4        htmltools_0.2.4     httr_0.3           
## [16] hydromad_0.9-20     knitr_1.6           labeling_0.2       
## [19] lattice_0.20-29     latticeExtra_0.6-26 magrittr_1.0.1     
## [22] MASS_7.3-33         memoise_0.2.1       munsell_0.4.2      
## [25] nnet_7.3-8          parallel_3.1.1      plyr_1.8.1         
## [28] png_0.1-7           proto_0.3-10        quadprog_1.5-5     
## [31] RColorBrewer_1.0-5  Rcpp_0.11.2         RCurl_1.95-4.1     
## [34] reshape2_1.4        RgoogleMaps_1.2.0.6 rjson_0.2.14       
## [37] RJSONIO_1.2-0.2     rmarkdown_0.2.50    scales_0.2.4       
## [40] spam_0.41-0         splines_3.1.1       survival_2.37-7    
## [43] tools_3.1.1         tseries_0.10-32     whisker_0.3-2      
## [46] yaml_2.1.13